KDD Cup 1999 - intrusion detection

http://kdd.ics.uci.edu/databases/kddcup99/task.html

Here is a paper that analyzes the dataset https://web.cs.dal.ca/~zincir/bildiri/pst05-gnm.pdf


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, cluster, \
    preprocessing, metrics, pipeline, tree, ensemble, decomposition

pd.options.display.max_columns = 1000
%matplotlib inline

In [6]:
num_cluster = 30

In [7]:
columns = [f.split(":")[0] for f in """
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
""".split("\n") if len(f)>0]

columns.append("Category")
print(columns)


['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Category']

In [8]:
df = pd.read_csv("/data/kddcup.data", header=None, names=columns)

In [9]:
df.head()


Out[9]:
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot num_failed_logins logged_in num_compromised root_shell su_attempted num_root num_file_creations num_shells num_access_files num_outbound_cmds is_host_login is_guest_login count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate Category
0 0 tcp http SF 215 45076 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0 0.0 0.0 0.00 0.0 0.0 0.0 0.0 0.0 normal.
1 0 tcp http SF 162 4528 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 2 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1 1 1.0 0.0 1.00 0.0 0.0 0.0 0.0 0.0 normal.
2 0 tcp http SF 236 1228 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2 2 1.0 0.0 0.50 0.0 0.0 0.0 0.0 0.0 normal.
3 0 tcp http SF 233 2032 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 2 0.0 0.0 0.0 0.0 1.0 0.0 0.0 3 3 1.0 0.0 0.33 0.0 0.0 0.0 0.0 0.0 normal.
4 0 tcp http SF 239 486 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 3 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 4 4 1.0 0.0 0.25 0.0 0.0 0.0 0.0 0.0 normal.

In [10]:
df.Category.value_counts()


Out[10]:
smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: Category, dtype: int64

Attacks fall into one of four categories: User to Root; Remote to Local; Denial of Service; and Probe.

  • Denial of Service (dos): Attacker tries to prevent legitimate users from using a service.
  • Remote to Local (r2l): Attacker does not have an account on the victim machine, hence tries to gain access.
  • User to Root (u2r): Attacker has local access to the victim machine and tries to gain super user privileges.
  • Probe: Attacker tries to gain information about the target host.

Mapping is below.


In [11]:
attack_types = {
 'normal.': "normal", 
 'buffer_overflow.':'u2r', 
 'loadmodule.':'u2r', 
 'perl.':'u2r', 
 'neptune.':'dos',
 'smurf.':'dos',
 'guess_passwd.':'r2l', 
 'pod.': 'dos', 
 'teardrop.':'dos',
 'portsweep.':'probe',
 'ipsweep.':'probe',
 'land.':'dos',
 'ftp_write.':'r2l',
 'back.': 'dos',
 'imap.': 'r2l',
 'satan.': 'probe',
 'phf.':'r2l',
 'nmap.':'probe',
 'multihop.':'r2l',
 'warezmaster.':'r2l',
 'warezclient.':'r2l',
 'spy.':'r2l',
 'rootkit.':'u2r'}

In [12]:
df["label"] = np.where(df.Category == "normal.", "normal", "attack")
df["attack_type"] = df.Category.apply(lambda r: attack_types[r])

In [13]:
df.label.value_counts()/df.shape[0]


Out[13]:
attack    0.80141
normal    0.19859
Name: label, dtype: float64

In [14]:
df.attack_type.value_counts(dropna=False)


Out[14]:
dos       3883370
normal     972781
probe       41102
r2l          1126
u2r            52
Name: attack_type, dtype: int64

In [15]:
df_num = df.select_dtypes(include=[np.float64, np.int64])
df_num.head()


Out[15]:
duration src_bytes dst_bytes land wrong_fragment urgent hot num_failed_logins logged_in num_compromised root_shell su_attempted num_root num_file_creations num_shells num_access_files num_outbound_cmds is_host_login is_guest_login count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
0 0 215 45076 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0 0.0 0.0 0.00 0.0 0.0 0.0 0.0 0.0
1 0 162 4528 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 2 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1 1 1.0 0.0 1.00 0.0 0.0 0.0 0.0 0.0
2 0 236 1228 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2 2 1.0 0.0 0.50 0.0 0.0 0.0 0.0 0.0
3 0 233 2032 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 2 0.0 0.0 0.0 0.0 1.0 0.0 0.0 3 3 1.0 0.0 0.33 0.0 0.0 0.0 0.0 0.0
4 0 239 486 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 3 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 4 4 1.0 0.0 0.25 0.0 0.0 0.0 0.0 0.0

In [16]:
X = preprocessing.StandardScaler().fit_transform(df_num)

In [235]:
%%time
def display_2d(X, n_samples = 10000):
    pca = decomposition.PCA(n_components=2)
    pca_values = pca.fit_transform(X)
    X_pca = pca_values.copy()
    X_pca = pd.DataFrame(X_pca)
    X_pca["color"] = np.where(labels == "attack", "red", "green")
    X_sample = X_pca.sample(n_samples)
    colors = X_sample.color
    X_sample.plot.scatter(0, 1, color = colors)
    return pca_values

X_pca = display_2d(X)


CPU times: user 28.8 s, sys: 8.69 s, total: 37.5 s
Wall time: 28.4 s

In [237]:
%%time
y = preprocessing.LabelEncoder().fit_transform(df.label)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_pca, y, test_size = 0.3, random_state = 1)
est = tree.DecisionTreeClassifier(max_depth=5)
est.fit(X_train, y_train)
print("Accuracy:", est.score(X_test, y_test))


Accuracy: 0.9970405503800535
CPU times: user 9.49 s, sys: 251 ms, total: 9.74 s
Wall time: 9.74 s

In [242]:
est.feature_importances_


Out[242]:
array([0.2368105, 0.7631895])

In [39]:
pca = decomposition.PCA()
pca.fit(X)


Out[39]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [43]:
_, ax = plt.subplots(figsize = (10, 6))
pd.Series(pca.explained_variance_ratio_).plot.bar(ax = ax)
pd.Series(np.cumsum(pca.explained_variance_ratio_)).plot.line(ax = ax)


Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a167d2e10>

In [171]:
pd.DataFrame({"cumsum": np.cumsum(pca.explained_variance_ratio_)}).query("cumsum>=0.99").head()


Out[171]:
cumsum
24 0.992789

In [18]:
%%time
pca = decomposition.PCA(n_components=25)
X_pca = pca.fit_transform(X)


CPU times: user 1min 17s, sys: 56.8 s, total: 2min 13s
Wall time: 1min 54s

In [19]:
%%time
kmeans = cluster.MiniBatchKMeans(n_clusters=num_cluster)
y_cluster = kmeans.fit_predict(X_pca)


CPU times: user 15.6 s, sys: 892 ms, total: 16.5 s
Wall time: 9.23 s

In [20]:
pd.Series(y_cluster).value_counts()


Out[20]:
9     2262816
3      324108
26     284870
19     279067
8      266668
2      265703
4      251196
1      207936
16     128479
28     127851
11     109844
22      96891
18      91815
5       60104
21      44746
24      31088
0       28265
6       18491
25       8778
17       4996
27       2485
23       1035
7         651
12        548
dtype: int64

In [224]:
for i in range(num_cluster):
    print("Cluster: ", i, "")
    print(pd.Series(df.Category[y_cluster == i]).value_counts())
    print("\n")


Cluster:  0 
smurf.     2806103
normal.        304
Name: Category, dtype: int64


Cluster:  1 
neptune.      866160
portsweep.       201
normal.           65
imap.              2
Name: Category, dtype: int64


Cluster:  2 
neptune.      204570
portsweep.      7246
normal.         1882
ipsweep.         191
satan.             3
Name: Category, dtype: int64


Cluster:  3 
normal.          2411
warezclient.       19
ftp_write.          2
multihop.           2
guess_passwd.       1
warezmaster.        1
Name: Category, dtype: int64


Cluster:  4 
normal.             335198
ipsweep.               822
warezclient.           616
back.                  426
nmap.                   48
pod.                    23
warezmaster.            18
buffer_overflow.        12
satan.                   9
imap.                    9
neptune.                 9
rootkit.                 3
loadmodule.              3
multihop.                2
portsweep.               2
ftp_write.               1
smurf.                   1
Name: Category, dtype: int64


Cluster:  5 
normal.    1
Name: Category, dtype: int64


Cluster:  6 
normal.    26
Name: Category, dtype: int64


Cluster:  7 
normal.         19234
neptune.         1242
satan.            365
portsweep.        107
nmap.              18
ipsweep.            6
back.               6
warezclient.        5
Name: Category, dtype: int64


Cluster:  8 
normal.        350
loadmodule.      2
multihop.        1
spy.             1
Name: Category, dtype: int64


Cluster:  9 
normal.         288201
back.             1573
warezclient.        38
satan.               7
portsweep.           2
rootkit.             2
imap.                1
Name: Category, dtype: int64


Cluster:  10 
land.      21
normal.     7
Name: Category, dtype: int64


Cluster:  11 
normal.    2
Name: Category, dtype: int64


Cluster:  12 
teardrop.    970
Name: Category, dtype: int64


Cluster:  13 
portsweep.    1
Name: Category, dtype: int64


Cluster:  14 
normal.             222
buffer_overflow.     18
phf.                  4
perl.                 3
rootkit.              2
multihop.             1
loadmodule.           1
Name: Category, dtype: int64


Cluster:  15 
normal.    1
Name: Category, dtype: int64


Cluster:  16 
portsweep.    4
Name: Category, dtype: int64


Cluster:  17 
normal.       4567
ftp_write.       1
Name: Category, dtype: int64


Cluster:  18 
normal.         135
ipsweep.          1
warezmaster.      1
Name: Category, dtype: int64


Cluster:  19 
normal.         78097
nmap.             991
back.             190
ipsweep.           59
pod.               56
neptune.           20
warezclient.        8
satan.              3
portsweep.          2
ftp_write.          1
multihop.           1
Name: Category, dtype: int64


Cluster:  20 
normal.         1366
warezclient.     288
Name: Category, dtype: int64


Cluster:  21 
normal.         51339
ipsweep.           14
back.               8
portsweep.          2
neptune.            1
warezclient.        1
Name: Category, dtype: int64


Cluster:  22 
satan.        14007
portsweep.     1822
normal.           3
Name: Category, dtype: int64


Cluster:  23 
ipsweep.        11352
normal.          1806
pod.               94
warezclient.       40
neptune.            9
nmap.               8
portsweep.          4
ftp_write.          2
satan.              2
loadmodule.         2
Name: Category, dtype: int64


Cluster:  24 
normal.          8
satan.           1
guess_passwd.    1
Name: Category, dtype: int64


Cluster:  25 
normal.    49
Name: Category, dtype: int64


Cluster:  26 
normal.       6157
portsweep.     864
Name: Category, dtype: int64


Cluster:  27 
normal.          61
guess_passwd.    51
rootkit.          1
Name: Category, dtype: int64


Cluster:  28 
normal.       138963
smurf.          1782
satan.          1035
nmap.            230
pod.              80
portsweep.         9
teardrop.          7
rootkit.           2
spy.               1
Name: Category, dtype: int64


Cluster:  29 
normal.         42326
nmap.            1021
satan.            460
portsweep.        147
ipsweep.           36
pod.               11
neptune.            6
warezclient.        5
teardrop.           2
ftp_write.          1
loadmodule.         1
Name: Category, dtype: int64



In [178]:
distances = np.zeros([df.shape[0]])
for i in range(num_cluster):
    centroid = kmeans.cluster_centers_[i]
    distances[y_cluster==i] = np.sqrt(np.sum((X_pca[y_cluster==i] - centroid)**2, axis = 1))
np.sort(distances)[::-1][:100]


Out[178]:
array([689.23443365, 621.3949061 , 621.39031029, 440.30982984,
       421.46888954, 406.4107485 , 393.79130488, 365.76066498,
       345.07124035, 291.67571775, 278.80037374, 277.30149122,
       277.21147369, 277.2062876 , 257.45249046, 232.46404824,
       225.9500618 , 219.42014587, 214.72560286, 189.18878055,
       184.23841188, 177.53093176, 176.07001054, 171.907339  ,
       171.5358193 , 169.93502893, 168.03925748, 167.75202767,
       167.60447674, 161.94584276, 161.81064075, 160.36691135,
       155.14661059, 154.30681006, 153.93479222, 153.92025291,
       149.13032213, 148.65314661, 145.30075983, 142.48819667,
       138.94519965, 138.73436868, 138.69564646, 138.62820071,
       137.87648857, 134.95281781, 133.36937893, 133.14764103,
       132.80457913, 131.49850543, 130.45669101, 130.10272335,
       129.55642448, 129.27921177, 128.89958802, 128.87403315,
       128.86896531, 128.837752  , 127.09822912, 126.74857916,
       123.80487929, 123.7788794 , 123.74921296, 123.72026691,
       123.71962593, 123.71714541, 123.71498633, 123.71491887,
       123.71399852, 123.71220955, 123.71117236, 123.71102159,
       123.70958796, 123.70921104, 123.707626  , 123.70543123,
       123.70424887, 123.70406444, 123.70299678, 123.70226455,
       123.70051127, 123.61000927, 123.38736955, 122.44027687,
       122.38480992, 122.37976551, 120.44710575, 120.13354292,
       119.92699374, 119.84170685, 119.82401765, 119.21953847,
       119.20255892, 116.99605438, 116.64435882, 116.4482475 ,
       116.20164051, 115.84430421, 115.75866591, 115.20034272])

In [180]:
np.sum(distances ** 2), kmeans.inertia_


Out[180]:
(12327843.447282968, 12327843.447282847)

In [ ]:
Average distance of a point to its closest centroid to within each cluster

In [201]:
cluster_avg_distances = []
for i in range(num_cluster):
    cluster_avg_distances.append(np.mean(distances[y_cluster == i]))
pd.Series(cluster_avg_distances).sort_values(ascending=False).plot.bar()


Out[201]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ac769d6a0>

In [203]:
cluster_max_distances = []
for i in range(num_cluster):
    cluster_max_distances.append(np.max(distances[y_cluster == i]))
pd.Series(cluster_max_distances).sort_values(ascending=False).plot.bar()


Out[203]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ac87e8588>

In [143]:
pd.Series(df.label[distances>113]).value_counts()


Out[143]:
normal    82
attack    21
Name: label, dtype: int64

In [145]:
plt.boxplot(distances);



In [162]:
def outliers(distances):
    q1, q3 = np.percentile(distances, [0.25, 0.75])
    iqr = q3-q1
    upper_whisker = q3 + 1.5 * iqr
    lower_whisker = q1 - 1.5 * iqr
    return (distances > upper_whisker) | (distances < lower_whisker)

In [166]:
pd.Series(outliers(distances)).value_counts()


Out[166]:
False    2610187
True     2288244
dtype: int64

In [122]:
np.sqrt(np.sum((X_pca[y_cluster==i] - centroid)**2, axis = 1)).shape


Out[122]:
(2806407,)

In [175]:
X_pca[y_cluster==i].shape


Out[175]:
(44016, 25)

In [ ]: